Adapted and edited by Joseph White
This notebooks will help me talk through a bit of material regarding
spatial data, with some introduction to
spatial joins.
Key topics for today:
sf packagetmap packagetidycensus packageStandards:
library(knitr)
library(tidyverse)
library(janitor)
library(lubridate) # because we will probably see some dates
library(here) # a package I haven't taught you about before that doesn't do much, but ....
library(rnaturalearth)
library(WDI)
library(tigris)
library(rgdal)
library(sp)
Some additional packages focused on today’s work:
library(sf) # working with simple features - geospatial
library(tmap)
library(tidycensus)
sf: https://r-spatial.github.io/sf/articles/tmap: https://cran.r-project.org/web/packages/tmap/vignettes/tmap-getstarted.htmltidycensus package: https://walker-data.com/tidycensus/index.htmltidycensus : https://walker-data.com/census-r/index.htmlOur first data source comes from opendata.dc
https://opendata.dc.gov/datasets/DCGIS::dc-health-planning-neighborhoods/about
I will use the GeoJSON file. (Newer, not necessarily better, but … a single file. Not smaller, but … this one is not big.)
Data is easily readable
neigh=st_read(here("DC_Health_Planning_Neighborhoods_joey.geojson")) %>% clean_names()
Reading layer `DC_Health_Planning_Neighborhoods' from data source
`/Users/joewhite/Desktop/R-Studio/ds241/ds241_portfolio/DC_Health_Planning_Neighborhoods_joey.geojson'
using driver `GeoJSON'
Simple feature collection with 51 features and 8 fields
Geometry type: POLYGON
Dimension: XY
Bounding box: xmin: -77.11976 ymin: 38.79165 xmax: -76.9094 ymax: 38.99556
Geodetic CRS: WGS 84
class(neigh)
[1] "sf" "data.frame"
plot(neigh)
df1=tibble(fruit=c("apple","banana","cherry"),cost=c(1.5,1.2,2.25))
df2=tibble(fruit=c("apple","apple","cherry","lemon"),
desert=c("pie","cobbler","cobbler","cheesecake"),
cal=c(400,430,500,550))
df1
df2
left_join(df1,df2,by="fruit")
Covid case information is available from opendatadc:
https://opendata.dc.gov/datasets/DCGIS::dc-covid-19-total-positive-cases-by-neighborhood/about
Read cases information:
df_c=read_csv(here("DC_COVID-19_Total_Positive_Cases_by_Neighborhood_joey.csv")) %>% clean_names()
df_cases=df_c %>%
filter(as_date(date_reported) == "2022-02-22") %>%
separate(neighborhood,into=c("code","name"),sep = ":") %>%
mutate(code=case_when(
code=="N35" ~"N0",
TRUE ~ code
)) %>%
select(-objectid,-date_reported)
neigh2=left_join(neigh,df_cases,by=c("code"))
tmap_mode("view")
tmap mode set to interactive viewing
tm_shape(neigh2) +tm_polygons("total_positives",alpha=.5)
Let’s get some data using tidycensus. Need an API key https://api.census.gov/data/key_signup.html
census_api_key("d44395e2fa101f82260fae6b845676d71f017b70")
To install your API key for use in future sessions, run this function with `install = TRUE`.
#what variables
v20 = load_variables(2018,"acs5")
# median_family_income=" B06011_001"
# all "B00001_001"
#black "B02009_001"
Get some data:
df_cencus=get_acs(geography = "tract",
variables=c("median_inc"="B06011_001",
"pop"="B01001_001",
"pop_black"="B02009_001"),
state="DC",geometry=TRUE,year=2018)
Getting data from the 2014-2018 5-year ACS
Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
Using FIPS code '11' for state 'DC'
|
| | 0%
|
|=============================== | 42%
|
|========================================== | 57%
|
|=============================================== | 64%
|
|================================================= | 66%
|
|=========================================================== | 79%
|
|==========================================================================| 100%
class(df_cencus)
[1] "sf" "data.frame"
plot(df_cencus)
It’s in long format. Let’s make it wide.
df_cens=df_cencus %>% select(-moe) %>% spread(variable,estimate)
tm_shape(df_cens) +tm_polygons("median_inc",alpha=.5)
tm_shape(neigh2) +tm_borders(col="blue",lwd=5,alpha=.2)+
tm_shape(df_cens) +tm_borders(col="red",lwd=1,alpha=.3)
df_cens_adj=df_cens %>% st_transform(4326)
df_j=st_join(df_cens_adj,neigh2,largest=TRUE)
Warning: attribute variables are assumed to be spatially constant throughout all geometries
Other order?:
Since we want the geometry for the NEIGHBORHOODS, we need a to work a little harder:
df1=df_j %>% select(median_inc,pop,pop_black,objectid) %>%
group_by(objectid) %>%
summarise(pop_n=sum(pop),
pop_black_n=sum(pop_black),
adj_median_income=sum(pop*median_inc)/pop_n)
plot(df1)
#df2=left_join(neigh2,df1)
df2=left_join(neigh2,df1 %>% st_set_geometry(NULL))
Joining, by = "objectid"
df2=df2 %>% mutate(black_perc=pop_black_n/pop_n, covid_rate=total_positives/pop_n)
tm_shape(df2)+tm_polygons(c("adj_median_income","covid_rate","black_perc"))
df2 %>% filter(objectid!=30) %>% tm_shape()+tm_polygons(c("adj_median_income","covid_rate","black_perc"),alpha=.4)
#find where people ride bikes (bikes started in each ‘district’) — Joseph Whites work
bike_data <- read_csv("202209-capitalbikeshare-tripdata.csv") %>% clean_names()
Rows: 386085 Columns: 13── Column specification ────────────────────────────────────────────────────────────
Delimiter: ","
chr (5): ride_id, rideable_type, start_station_name, end_station_name, member_c...
dbl (6): start_station_id, end_station_id, start_lat, start_lng, end_lat, end_lng
dttm (2): started_at, ended_at
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
bike_data_sf <- bike_data %>%
mutate_at(vars(start_lat, start_lng), as.numeric) %>%
st_as_sf(
coords = c("start_lat", "start_lng"),
agr = "constant",
crs = "4326"
) %>%
sample_n(1000)
st_crs(bike_data_sf)
Coordinate Reference System: NA
st_crs(neigh2)
Coordinate Reference System:
User input: WGS 84
wkt:
GEOGCRS["WGS 84",
DATUM["World Geodetic System 1984",
ELLIPSOID["WGS 84",6378137,298.257223563,
LENGTHUNIT["metre",1]]],
PRIMEM["Greenwich",0,
ANGLEUNIT["degree",0.0174532925199433]],
CS[ellipsoidal,2],
AXIS["geodetic latitude (Lat)",north,
ORDER[1],
ANGLEUNIT["degree",0.0174532925199433]],
AXIS["geodetic longitude (Lon)",east,
ORDER[2],
ANGLEUNIT["degree",0.0174532925199433]],
ID["EPSG",4326]]
st_crs(bike_data_sf) <- 4326
bike_data_sf_1 <- bike_data_sf %>%
select(geometry) %>%
rename(geom_points = geometry)
neigh2_1 <- neigh2 %>%
select(geometry)
df_plz = st_join(bike_data_sf_1, neigh2_1, join = st_within)
#df_bikes_count <- count(as_tibble(df_plz), )
https://dw-rowlands.github.io/Job_Density_and_Commutes/Job_Density_and_Commutes.html
https://walker-data.com/census-r/index.html
What other interesting questions - is peak in bike data actually people going to work or tourist? - are casual riders in the more touristy areas? open data dc, tourist data, reverse geo coding to find location of monuments :):
Question to answer: Is there a spatial patter driven associated with member/casual? - look on open data dc site - could be some data on tourist activity